/*==============================================================================
FILE NAME: Explore_violation_severity.do
INPUTS: incidents_clean.dta, Notice_of_Violation_Clean.dta, Enforcements_Clean.dta,
OUTPUTS: IN_vio_cat.dta, 
UPDATED: 16 June 2022
==============================================================================*/

//explore complaint investigations and compare to non-complaint investigations

clear all

/* Set directory if working independently through code
if c(username)=="" { //insert username
	global rootdir "" // insert root path
	global processed_data "$rootdir/processed_data"  // Define global paths for replication package
} 
*/

//how likely are complaint-linked investigations to find a category A violation compared to non-complaint-linked investigations 
//note: we can't answer this directly using the investigations data file as that file does not have links to VNs so I'll use NOV and enforcements files 
//first identify complaint investigations that are not found in the investigations_clean.dta file. there are 959 of these.
//these will not show up in our analysis
use "$processed_data/incidents_clean.dta", clear
drop if IncidentStatus=="REFERRED"
keep if IN!=.
gen complaint_inv=1
label var complaint_inv "=1 if investigation is linked to complaint incident"
keep IN complaint_inv
duplicates drop
//IN uniquely identifies obs at this point
sort IN
save "$processed_data/temp_CIN.dta", replace
//this file identifies investigations that are linked to non-referred complaint incidents in the incidents data file
use "$processed_data/investigations_clean.dta", clear
destring IN, replace
keep IN
duplicates drop
sort IN
merge 1:1 IN using "$processed_data/temp_CIN.dta"
keep if _merge==2
drop _merge
gen nolink=1
label var nolink "=1 to identify a complaint-linked investigation not found in investigations file
save "$processed_data/nolink.dta", replace
//this file identifies investigations that are linked to air complaint incidents but are not found in the investigations data file


use "$processed_data/Notice_of_Violation_Clean.dta", clear
destring IN, replace
keep IN ViolationCategory
duplicates drop
gen catA=0
replace catA=1 if ViolationCategory=="A"
gen catB=0
replace catB=1 if ViolationCategory=="B"
gen catC=0
replace catC=1 if ViolationCategory=="C"
sort IN
collapse (sum) catA catB catC, by(IN)
isid IN
label var catA "=1 if investigation is linked to Category A violation"
label var catB "=1 if investigation is linked to Category B violation"
label var catC "=1 if investigation is linked to Category C violation"
sort IN
save "$processed_data/temp_NOV.dta", replace
//this file identifies investigations that show up in the NOV data file

use "$processed_data/Enforcements_Clean.dta", clear
destring IN, replace
keep IN ViolationCateogry
rename ViolationCateogry ViolationCategory
duplicates drop
gen catA=0
replace catA=1 if ViolationCategory=="A"
gen catB=0
replace catB=1 if ViolationCategory=="B"
gen catC=0
replace catC=1 if ViolationCategory=="C"
sort IN
collapse (sum) catA catB catC, by(IN)
isid IN
label var catA "=1 if investigation is linked to Category A violation"
label var catB "=1 if investigation is linked to Category B violation"
label var catC "=1 if investigation is linked to Category C violation"
sort IN
save "$processed_data/temp_NOE.dta", replace
//this file identifies investigations that show up in the NOE data file

use "$processed_data/temp_NOE.dta", clear
append using "$processed_data/temp_NOV.dta"
unique IN
collapse (max) catA catB catC, by(IN)
unique IN
sort IN
save "$processed_data/temp_vio_cat.dta", replace

use "$processed_data/investigations_clean.dta", clear
destring IN, replace
keep IN
duplicates drop
sort IN
save "$processed_data/temp_IN.dta", replace
//this file identifies all investigations in the investigations data file

//start with all investigations and merge with temp_vio_cat to identify investigations that resulted different types of violations
use "$processed_data/temp_IN.dta", clear
merge 1:1 IN using "$processed_data/temp_vio_cat.dta"
drop if _merge==2
drop _merge
//only 8 observations from vio_cat file fail to merge with IN file
sort IN
foreach x of varlist catA catB catC{
	replace `x'=0 if `x'==.
}
sort IN
save "$processed_data/IN_vio_cat.dta", replace

//distinguish between complaint investigations and non-complaint investigations
use "$processed_data/IN_vio_cat.dta", clear
merge 1:1 IN using "$processed_data/temp_CIN.dta"
replace complaint_inv=0 if _merge==1
//remove nolink complaint investigations; only 959 of them
//see Explore_Investiations.do for more info 
drop if _merge==2
drop _merge
sort IN
save "$processed_data/IN_vio_cat.dta", replace

//now link back to investigations data to get year of investigation
use "$processed_data/investigations_clean.dta", clear
destring IN, replace
label var year "$processed_data/investigation year"
sort IN
merge m:1 IN using "$processed_data/IN_vio_cat.dta"
drop _merge
keep IN complaint_inv catA catB catC year
gen any_cat=0
replace any_cat=1 if catA==1 | catB==1 | catC==1
duplicates drop
save "$processed_data/IN_vio_cat.dta", replace

erase "$processed_data/temp_IN.dta"
erase "$processed_data/temp_CIN.dta"
erase "$processed_data/nolink.dta"
erase "$processed_data/temp_NOV.dta"
erase "$processed_data/temp_NOE.dta"
erase "$processed_data/temp_vio_cat.dta"
